import math
import os
import shutil
import jieba
import jieba.analyse
import pandas as pd


def gte_txt():
    """Load the novel text and tag every line with its chapter number.

    Reads '金庸-射雕英雄传txt精校版.txt' (GBK, one paragraph per row; the
    'aaa' separator never occurs, so each physical line is one row) and
    returns a DataFrame with columns:
      txt  - the raw line
      head - first character of the line
      mid  - index of '回 ' within the line (-1 if absent)
      len  - line length
      chap - running chapter number (0 before chapter 1 and for the appendix)
    """
    raw = pd.read_csv("金庸-射雕英雄传txt精校版.txt", names=['txt'], sep='aaa',
                      encoding='gbk', engine='python')

    # Vectorized feature columns (the original used row-wise .apply).
    raw['head'] = raw.txt.str[:1]
    raw['mid'] = raw.txt.str.find("回 ")
    raw['len'] = raw.txt.str.len()

    # A chapter heading looks like '第X回 ...' and is short (< 30 chars).
    # Assign the chapter id in one pass and set the column once at the end,
    # instead of a per-row raw.loc[i, 'chap'] write (quadratic-ish and
    # forces a float column).
    chapnum = 0
    chap_ids = []
    for head, mid, length, text in zip(raw['head'], raw['mid'],
                                       raw['len'], raw['txt']):
        if head == '第' and mid > 0 and length < 30:
            chapnum += 1
        # After the 40 chapters, the appendix resets the counter to 0 so
        # its lines are excluded from every chapter.
        if chapnum >= 40 and text == "附录一：成吉思汗家族":
            chapnum = 0
        chap_ids.append(chapnum)
    raw['chap'] = chap_ids
    return raw


def chap_num(raw, num):
    """Return the full text of chapter *num* as one string.

    Selects the rows whose 'chap' tag equals *num* and folds their 'txt'
    values together (summing a Series of strings concatenates them).
    """
    selected = raw.loc[raw['chap'] == num, 'txt']
    return selected.sum()


def stop_words():
    """Read '停用词.txt' (UTF-8, one stop word per line) into a list.

    Blank lines are skipped.  The original guard `len(line) > 0` was
    always true because the trailing newline was still attached, so empty
    strings leaked into the stop-word list; stripping first fixes that.
    """
    with open('停用词.txt', 'r', encoding='utf-8') as f:
        return [word for word in (line.strip() for line in f) if word]


def get_dict(res):
    """Build a document-frequency table over the documents in *res*.

    res: iterable of document strings (one per chapter).
    Returns a dict mapping word -> number of documents that contain it.

    Fixes vs. the original:
    - stop_words() was called inside the per-word loop, re-reading the
      stop-word file for every single token; it is now loaded once into a
      set (O(1) membership).
    - spaces were removed with `outstr.remove(' ')` while iterating the
      same list, which skips consecutive spaces; ' ' is now filtered up
      front like '\t' and '\n'.
    - no longer shadows the builtin `dict`; the unused `total` counter is
      gone.
    """
    stopwords = set(stop_words())
    all_dict = {}
    for content in res:
        # Unique words of this document, minus stop words and whitespace
        # tokens — document frequency only needs presence, not counts.
        doc_words = set()
        for word in jieba.cut(content.strip()):
            if word not in stopwords and word not in ('\t', '\n', ' '):
                doc_words.add(word)
        for word in doc_words:
            all_dict[word] = all_dict.get(word, 0) + 1
    return all_dict


def work(result, num_docs=40):
    """Compute an IDF value per CJK word and write them to 'idf_dict.txt'.

    result: iterable of document strings (one per chapter).
    num_docs: total document count used in the IDF formula
        log10(num_docs / (df + 1)); defaults to the novel's 40 chapters
        (previously a hard-coded magic number).

    Output file format: one '<word> <idf>' pair per line, UTF-8.

    Fixes vs. the original:
    - the CJK filter used `key > u'\u4e00'`, an off-by-one that silently
      dropped the word '一' (exactly U+4E00); now inclusive on both ends.
    - the output file is managed with a context manager so it is closed
      even if a write fails.
    - the `i != '\n'` check was removed: a key that passed the CJK range
      test can never be a newline.
    """
    all_dict = get_dict(result)
    idf_dict = {}
    for key, df in all_dict.items():
        if u'\u4e00' <= key <= u'\u9fa5':
            idf_dict[key] = '%.10f' % math.log10(num_docs / (df + 1))
    print('构造结束')
    with open('idf_dict.txt', 'w', encoding='utf-8') as f:
        for word, idf in idf_dict.items():
            f.write(word + ' ' + idf + '\n')


def make_dirs(path):
    """Create *path*, write each of the 40 chapters to its own file there,
    and return the list of chapter texts (index 0 == chapter 1).

    Fixes vs. the original:
    - `os.path.exists` was computed and ignored, so `os.makedirs` crashed
      whenever the directory already existed; `exist_ok=True` fixes that.
    - `gte_txt()` re-parsed the entire novel twice per chapter (80 full
      parses); the DataFrame is now built once and reused.
    - the chapter text was extracted twice per iteration; `content` is
      now written directly.
    - the chapter files are written as UTF-8 instead of the platform
      default encoding.
    """
    os.makedirs(path, exist_ok=True)
    raw = gte_txt()  # parse the novel once
    chapters = []
    for chap in range(1, 41):
        content = chap_num(raw, chap)
        filename = path + os.sep + "第" + str(chap) + "章.txt"
        with open(filename, 'w', encoding='utf-8') as f:
            f.write(content)
        print("已将第" + str(chap) + "回内容写入文档集.txt")
        chapters.append(content)
    return chapters


if __name__ == '__main__':
    # Split the novel into per-chapter files and keep the chapter texts.
    output_dir = './《射雕》文档集'
    chapters = make_dirs(output_dir)

    # Build our own IDF table from the chapters (written to idf_dict.txt).
    work(chapters)

    # Teach jieba the novel-specific vocabulary before keyword extraction.
    with open('金庸小说.txt', encoding='gbk') as user_dict:
        jieba.load_userdict(user_dict)

    jieba.analyse.set_stop_words('停用词.txt')
    # NOTE(review): this loads jieba's big IDF file, not the freshly
    # generated 'idf_dict.txt' — confirm which table was intended.
    jieba.analyse.set_idf_path("idf.txt.big")

    # Top-10 TF-IDF keywords of chapter 1, with weights.
    top_keywords = jieba.analyse.extract_tags(chapters[0], withWeight=True, topK=10)
    print(top_keywords)